# Data manipulation
import numpy as np
import pandas as pd
# Statistical tests
from scipy import stats
# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
# Silence library warnings for cleaner notebook output
import warnings
warnings.filterwarnings('ignore')
# Korean font so hangul axis labels/titles render correctly
plt.rc('font', family = 'Malgun Gothic')
plt.rc('axes', unicode_minus = False)
# Retina-resolution inline figures
# NOTE(review): set_matplotlib_formats was deprecated in IPython 7.23+
# (moved to matplotlib_inline.backend_inline) -- confirm the IPython version used.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')
# Load the UCI bank telemarketing survey data set.
df = pd.read_csv('data/bank_customer_survey.csv')
print(df.shape)
df.head()
UCI Machine Learning Repository - Bank Marketing Data Set
<고객 개인 정보>
<은행 관련 정보>
<텔레마케팅 관련 정보>
<이전과 이번 마케팅 결과>
# Structure and explicit-NaN overview (the real missing values are coded as 'unknown').
df.info()
df.isnull().sum()
# 'unknown' also represents missing data in several categorical columns
print('job:', (df['job'] == 'unknown').sum())
print('education:', (df['education'] == 'unknown').sum())
print('contact:', (df['contact'] == 'unknown').sum())
print('poutcome:', (df['poutcome'] == 'unknown').sum())
85% 가 결측치이므로 poutcome 컬럼을 제거해야 합니다.
# Share of 'unknown'/'other' in poutcome, both treated as missing.
total = df['poutcome'].shape[0]
null = df.loc[(df['poutcome'] == 'unknown')|(df['poutcome'] == 'other'), 'poutcome'].shape[0]
print('결측치(%):', null / total * 100)
df['poutcome'].value_counts().plot.barh()
# Remove duplicated rows.
print(df.shape)
df = df[~df.duplicated()]
df.shape
df.nunique()
캠페인 결과로 정기 예금을 들지 않은 고객('No')이 88%로 데이터 불균형 문제가 있습니다.
# Target distribution: heavily imbalanced toward 'no'.
print(df['y'].value_counts(normalize=True))
sns.countplot(data=df, x='y')
age 와 day 외 다른 연속형 변수는 모두 이상치가 확인됩니다.
# Non-object (numeric) columns; [:-1] drops the last numeric column
# (presumably the encoded target 'y' -- TODO confirm against df.dtypes order).
num_cols = df.dtypes[df.dtypes != 'object'].index.tolist()[:-1]
print(num_cols)
df.describe()
age 외 대부분의 컬럼이 정규분포를 따르지 않아 차후 스케일링이 필요합니다.
h = df[num_cols].hist(figsize=(20, 15))
duration만 목표변수에 따라 큰 차이를 보이는 것 같아 보입니다.
# Pairwise plot coloured by the target.
# NOTE(review): diag_kws={'bw':5} uses an older seaborn KDE argument name
# (renamed to bw_method/bw_adjust in seaborn 0.11+) -- confirm version.
num_cols.append('y')
sns.pairplot(df[num_cols], hue='y', diag_kws={'bw':5})
# Per-class overlaid histograms of the six continuous features.
numeric_features = [
    'age',
    'balance',
    'duration',
    'campaign',
    'pdays',
    'previous'
]
len(numeric_features)
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15,10))
# Blue = no deposit (y == 0), orange = deposit (y == 1), drawn on the same axes grid.
df[df.y == 0][numeric_features].hist(bins=35, color="blue", alpha=0.5, ax=axes)
df[df.y == 1][numeric_features].hist(bins=35, color="orange", alpha=0.7, ax=axes)
plt.legend(['No deposit', 'deposit'], shadow=True, loc=9)
변수간 상관계수가 높은 변수는 pdays 와 previous 입니다.
# Correlation heatmap of the numeric columns (num_cols still includes 'y' here).
corr = df[num_cols].corr()
sns.heatmap(data=corr, annot=True, cmap='seismic', vmax=1, vmin=-1)
당연한 결과로 이어질 수 있는 'duration'변수는 삭제해야 합니다.
# Categorical (object-dtype) column names.
cat_cols = df.dtypes[df.dtypes == 'object'].index.tolist()
cat_cols
정기 예금을 신청한 고객 중에서는
반면에 정기 예금을 신청하지 않은 고객 중에서는
# Count plot of every categorical column, split by the target, on a 3x3 grid.
fig, axes = plt.subplots(ncols=3, nrows=3, figsize=(20, 20))
for i, col_name in enumerate(cat_cols):
    row = i // 3
    col = i % 3
    sns.countplot(data=df, x=col_name, hue='y', ax=axes[row, col])
직업만 따로 보았습니다.
# Job breakdown restricted to subscribers (y == 1), then full count plot by job.
print(df[df.y==1]['job'].value_counts().sort_values(ascending=False))
plt.figure(figsize=(10, 5))
plt.xticks(rotation=45)
sns.countplot(data=df, x='job', hue='y')
각각의 컬럼별 계좌 잔고에 따른 정기 예금 신청 결과의 분포를 보았습니다.
# Balance distribution vs. target split by default / education / job.
fig = plt.figure(figsize=(20, 15))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(212)
g = sns.violinplot(data=df, x='default', y='balance', hue='y', ax=ax1)
g.set_title('채무 불이행 여부별 정기 예금 신청 결과')
g1 = sns.violinplot(data=df, x='education', y='balance', hue='y', ax=ax2)
g1.set_title('교육 수준별 정기 예금 신청 결과')
g2 = sns.violinplot(data=df, x='job', y='balance', hue='y', ax=ax3)
# g2.set_xticklabels(df['job'].unique(), rotation = 90, rotation_mode = 'anchor')
g2.set_title('직업별 정기 예금 신청 결과')
plt.show()
# Same layout for loan / housing / marital.
fig = plt.figure(figsize=(20, 15))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(212)
g = sns.violinplot(data=df, x='loan', y='balance', hue='y', ax=ax1)
g.set_title('개인 대출 여부별 정기 예금 신청 결과')
g1 = sns.violinplot(data=df, x='housing', y='balance', hue='y', ax=ax2)
g1.set_title('주택 담보 대출 여부별 정기 예금 신청 결과')
g2 = sns.violinplot(data=df, x='marital', y='balance', hue='y', ax=ax3)
# g2.set_xticklabels(df['job'].unique(), rotation = 90, rotation_mode = 'anchor')
g2.set_title('결혼 여부별 정기 예금 신청 결과')
plt.show()
정기 예금 고객만 확인해보았습니다.
# Distributions restricted to subscribers (y == 1).
h = df[df['y'] == 1].hist(figsize=(15, 15))
cat = df[cat_cols]
cat_y = cat[df['y'] == 1]
fig, axes = plt.subplots(ncols=3, nrows=3, figsize=(15, 15))
for i, col_name in enumerate(cat_cols):
    row = i // 3
    col = i % 3
    sns.countplot(data=cat_y, x=col_name, ax=axes[row, col])
직업별 교육수준을 확인해보았습니다.
plt.figure(figsize=(16, 8))
sns.countplot(data=df, x='education', hue='job')
계좌 잔고가 높은 사람을 확인해보았습니다.
# Balance vs. age, coloured by job / marital / education.
plt.figure(figsize=(15, 8))
sns.scatterplot(data=df, x='balance', y='age', hue='job')
plt.figure(figsize=(15, 8))
sns.scatterplot(data=df, x='balance', y='age', hue='marital')
plt.figure(figsize=(15, 8))
sns.scatterplot(data=df, x='balance', y='age', hue='education')
데이터가 2008년 5월부터 2010년 11월까지 수집되었고 시작된 달에 가장 많이 캠페인 활동이 있었습니다.
# Campaign volume per month, overall and for subscribers only.
df.month.value_counts().plot.bar()
df_y1 = df[df['y']==1]
df_y1.month.value_counts().plot.bar()
# Hand-collected per-month totals: [subscribed, contacted].
# NOTE(review): these constants were presumably transcribed from the counts
# above -- verify they match the current (deduplicated) data.
values = [[142, 1403], [441, 2649], [248, 477], [577, 2932], [925, 13766], [546, 5341], [627, 6895], [688, 6247], [269, 579], [323, 738], [403, 3970], [100, 214]]
index = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
columns = ['true', 'counts']
months = pd.DataFrame(values, index=index, columns=columns)
print(months)
각각의 달에 따른 정기 예금 신청 결과 비율을 보았습니다.
# Conversion rate per month.
months['prob'] = months['true'] / months['counts']
months
# Dual-axis plot: conversion rate (left axis) vs. contact volume (right axis).
fig , ax0 = plt.subplots(figsize=(10, 10))
ax1 = ax0.twinx()
ax0.set_title("Month efficiency;")
ax0.plot(months["prob"] ,'r-', label = "prob" ,marker='o')
ax0.set_ylabel("prob")
ax0.grid(False)
ax1.plot(months["counts"] ,'b:', label ="count" , marker = 's')
ax1.set_ylabel("count")
ax1.grid(False)
ax0.set_xlabel("months")
fig.legend()
plt.show()


EDA를 통해 확인해보니 변수의 이상치 처리가 필요해보였습니다.
# pdays == -1 means "never contacted before"; encode that as the binary flag panswer.
df['panswer'] = np.where(df['pdays'] == -1, 0, 1)
df['panswer'].value_counts()
del df['pdays']
# previous == 275 is an extreme outlier; drop that row.
df.previous.value_counts(ascending=True).head(3)
df[df.previous == 275]
pre_275 = df[df['previous'] == 275].index
df = df.drop(pre_275)
# Drop day (low signal), duration (target leakage) and poutcome (mostly missing).
del df['day']
del df['duration']
del df['poutcome']
# Round-trip through CSV; this also resets the index after the row drops above.
df.to_csv('telemarketing.csv', index=False)
df = pd.read_csv('telemarketing.csv')
df.head()
cat_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month']
# Columns with exactly two distinct values...
bin_cols = df.nunique()[df.nunique() == 2].index.tolist()
print(bin_cols)
# ...then deliberately overridden with only the yes/no string columns
# (the computed list also catches already-numeric flags such as panswer).
bin_cols = ['default', 'housing', 'loan']
bin_cols
multi_cols = [x for x in cat_columns if x not in bin_cols]
multi_cols
# Label-encode the binary columns; one-hot the rest (drop_first avoids the dummy trap).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in bin_cols:
    df[i] = le.fit_transform(df[i])
df[bin_cols].head()
df = pd.get_dummies(data = df, columns = multi_cols, drop_first=True)
df
df.info()
'balance'변수는 이상치가 매우 크기 때문에 로그변환 후 standard scaling을 진행하였습니다.
# Shift balance to be strictly positive before the log transform
# (presumably the minimum balance is -8019 -- TODO confirm), then log + scale.
df['balance'] = df['balance'] + 8020
df.balance.describe()
import math
from sklearn import preprocessing
df['balance'] = preprocessing.scale(np.log(df['balance']+1))
df.describe()
# Standard-scale the skewed numeric columns (balance is scaled a second time here;
# re-standardising an already standardised column is a no-op in effect).
num_cols = ['balance', 'campaign', 'previous']
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
scaled = std.fit_transform(df[num_cols])
scaled = pd.DataFrame(scaled, columns = num_cols)
scaled.head()
print(df.shape)
# Replace the raw columns with their scaled versions; index alignment is safe
# because df was re-read from CSV above, so both frames share a fresh RangeIndex.
df1 = df.drop(columns = num_cols, axis = 1)
df1 = df1.merge(scaled, left_index=True, right_index=True, how = "left")
print(df1.shape)
sns.set(rc={'figure.figsize':(12,10)})
sns.heatmap(df1.corr(), cmap="seismic", annot=False, vmin=-1, vmax=1)
df1.to_csv('survey_features.csv', index=False)
df1 = pd.read_csv('survey_features.csv')
df1.head()
# Feature / target split; statsmodels needs an explicit intercept column.
X, y = df1.drop('y',axis=1), df1['y']
import statsmodels.api as sm
# Add the intercept ('const') column
X = sm.add_constant(X)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Dot -> png conversion
import pydot

# Unconstrained tree (grows until leaves are pure) as an overfitting baseline.
dTreeAll = DecisionTreeClassifier(random_state=0)
dTreeAll.fit(X_train, y_train)
print("Train Set Score1 : {:.2f}".format(dTreeAll.score(X_train, y_train)))
print("Test Set Score1 : {:.2f}".format(dTreeAll.score(X_test, y_test)))

# Pruned tree: depth capped at 5 to reduce overfitting.
dTreeLimit = DecisionTreeClassifier(max_depth=5, random_state=42)
dTreeLimit.fit(X_train, y_train)
print("Train Set Score2 : {:.2f}".format(dTreeLimit.score(X_train, y_train)))
print("Test Set Score2 : {:.2f}".format(dTreeLimit.score(X_test, y_test)))

# BUGFIX: feature_names must be the columns the tree was trained on.
# X_train includes the statsmodels 'const' column and excludes the target 'y';
# df1.columns happened to have the same length but misaligned labels, so the
# exported diagram showed the wrong feature names on the split nodes.
export_graphviz(dTreeLimit, out_file="dicisionTree1.dot", class_names=["No deposit","deposit"],
                feature_names=X_train.columns, impurity=False, filled=True)
# Encoding matters because of the Korean labels
(graph,) = pydot.graph_from_dot_file('dicisionTree1.dot', encoding='utf8')
# Save the Dot file as a png image
graph.write_png('dicisionTree1.png')

# Feature importances of the pruned tree.
predictions = dTreeLimit.predict(X_test)
mf = dTreeLimit.feature_importances_
plt.figure(figsize=(10,5))
sns.barplot(y=X.columns, x= mf)

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
print('accuracy = {:.3f}'.format(accuracy_score(y_test, predictions) * 100))
print('f1_score = {:.3f}'.format(f1_score(y_test, predictions)))
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)
print(cm)
# BUGFIX: draw the heatmap; the old print() only echoed the returned Axes object.
sns.heatmap(cm, annot=True, cmap='Blues')
from sklearn.metrics import classification_report
cr = classification_report(y_test, predictions)
print(cr)
# Fit a statsmodels logistic regression to inspect coefficients and p-values.
import statsmodels.api as sm
pd.options.mode.chained_assignment = None
logitreg = sm.Logit(y_train, X_train)
logitreg_fit = logitreg.fit()
print(logitreg_fit.summary())
# Predicted subscription probabilities on the hold-out set.
test_prob_logitreg = logitreg_fit.predict(X_test)
test_prob_logitreg
cols = X.columns
def back_feature_elem (data_frame,dep_var,col_list):
    """Backward feature elimination for a statsmodels Logit model.

    Repeatedly fits sm.Logit(dep_var, data_frame[col_list]) and drops the
    feature with the largest (3-dp rounded) p-value, until every remaining
    p-value is below alpha = 0.05, then returns that fitted result.

    Returns None if every column is eliminated before the criterion is met.
    """
    while len(col_list) > 0:
        model = sm.Logit(dep_var, data_frame[col_list])
        result = model.fit(disp=0)
        # Single-element Series holding the worst (largest) p-value.
        largest_pvalue = round(result.pvalues, 3).nlargest(1)
        # BUGFIX: use positional .iloc[0] instead of label-based [0]
        # (Series[0] is label lookup, which is deprecated and only worked by
        # accident); also removed an unreachable `break` after `return`.
        if largest_pvalue.iloc[0] < 0.05:
            return result
        col_list = col_list.drop(largest_pvalue.index)
    return None  # explicit: all features were eliminated
# Run backward elimination, then report odds ratios with 95% confidence intervals.
result=back_feature_elem(X,y,cols)
result.summary()
# exp() of coefficients/CI bounds turns log-odds into odds ratios.
params = np.exp(result.params)
conf = np.exp(result.conf_int())
conf['OR'] = params
pvalue=round(result.pvalues,3)
conf['pvalue']=pvalue
conf.columns = ['CI 95%(2.5%)', 'CI 95%(97.5%)', 'Odds Ratio','pvalue']
print ((conf))
def age_split(df1) :
    """Map a row's age to one of three cohort labels.

    Intended for DataFrame.apply(..., axis=1): `df1` is a row with an 'age'
    entry. Returns '18-33', '34-48' or '49-95'; ages outside 18-95 fall
    through and return None (-> NaN in the resulting column).
    """
    # Chained comparisons replace the original `(a) & (b)` bitwise pairs --
    # identical results for scalar ages, and idiomatic Python.
    if 17 < df1['age'] <= 33:
        return '18-33'
    elif 33 < df1['age'] <= 48:
        return '34-48'
    elif 48 < df1['age'] <= 95:
        return '49-95'
    return None  # explicit: out-of-range ages were an implicit None before
# Bucket age into three groups, one-hot encode them, and rebuild the split/model.
df1['age_group'] = df1.apply(age_split, axis = 1)
del df1['age']
df1
age_group = ['age_group']
age_group
df1 = pd.get_dummies(data=df1, columns=age_group, drop_first=True)
df1
# Fresh feature/target split with the new age-group dummies.
X, y = df1.drop('y',axis=1), df1['y']
import statsmodels.api as sm
X = sm.add_constant(X)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)
# Refit the logistic regression on the new feature set.
import statsmodels.api as sm
pd.options.mode.chained_assignment = None
logitreg = sm.Logit(y_train, X_train)
logitreg_fit = logitreg.fit()
print(logitreg_fit.summary())
cols = X.columns
def back_feature_elem (data_frame,dep_var,col_list):
    """Backward feature elimination for a statsmodels Logit model.

    Repeatedly fits sm.Logit(dep_var, data_frame[col_list]) and drops the
    feature with the largest (3-dp rounded) p-value, until every remaining
    p-value is below alpha = 0.05, then returns that fitted result.

    Returns None if every column is eliminated before the criterion is met.
    """
    while len(col_list) > 0:
        model = sm.Logit(dep_var, data_frame[col_list])
        result = model.fit(disp=0)
        # Single-element Series holding the worst (largest) p-value.
        largest_pvalue = round(result.pvalues, 3).nlargest(1)
        # BUGFIX: use positional .iloc[0] instead of label-based [0]
        # (Series[0] is label lookup, which is deprecated and only worked by
        # accident); also removed an unreachable `break` after `return`.
        if largest_pvalue.iloc[0] < 0.05:
            return result
        col_list = col_list.drop(largest_pvalue.index)
    return None  # explicit: all features were eliminated
# Backward elimination on the age-grouped features, then odds-ratio report.
result=back_feature_elem(X,y,cols)
result.summary()
params = np.exp(result.params)
conf = np.exp(result.conf_int())
conf['OR'] = params
pvalue=round(result.pvalues,3)
conf['pvalue']=pvalue
conf.columns = ['CI 95%(2.5%)', 'CI 95%(97.5%)', 'Odds Ratio','pvalue']
print ((conf))
마케팅 대상
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100 , n_jobs=-1 , random_state=42)
rf.fit(X_train , y_train)
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
random_grid
rf_random = RandomizedSearchCV(estimator = rf,
param_distributions = random_grid,
n_iter = 3,
cv = 3,
random_state=42,
n_jobs = -1)
rf_random.fit(X_train, y_train)
rf_random.best_params_
rf_model = rf_random.best_estimator_
rf_model
rf_model.fit(X_train , y_train)
rf_output = rf_model.predict(X_test)
feature_importances = rf_model.feature_importances_
plt.figure(figsize = (15,10))
sns.barplot(x =feature_importances , y = X.columns)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, rf_output) * 100
from sklearn.metrics import classification_report
cr = classification_report(y_test, rf_output)
print(cr)
features = result.params.index.to_list()[1:]
df1 = df1[features]
df1
y
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss, fbeta_score
from sklearn.metrics import auc, roc_curve, roc_auc_score, precision_recall_curve
X = df1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)
def model_report(model_name, model):
    """Fit `model`, search the F1-optimal probability threshold, and print a
    full report: precision/recall/F1 and confusion matrices at thresholds
    0.25 / 0.50 / 0.75 / optimum, log-loss, ROC-AUC, and the ROC curve.

    Notebook-style side channel: headline metrics are exported through the
    globals model_f1, model_auc, model_ll, model_roc_auc for the comparison
    charts further down. Relies on module-level X_train/X_test/y_train/y_test.
    """
    print('\nSearch for OPTIMAL THRESHOLD, vary from 0.0001 to 0.9999, fit/predict on train/test data')
    # Fit on the training split.
    model.fit(X_train, y_train)
    optimal_th = 0.5 # start with default threshold value
    # Three refinement passes: coarse, then one decimal place finer each time.
    for i in range(0,3):
        score_list = []
        print('\nLooping decimal place', i+1)
        th_list = [np.linspace(optimal_th-0.4999, optimal_th+0.4999, 11),
                   # eg [ 0.0001 , 0.1008, 0.2006, 0.3004, 0.4002, 0.5, 0.5998, 0.6996, 0.7994, 0.8992, 0.9999 ]
                   np.linspace(optimal_th-0.1, optimal_th+0.1, 21),
                   # eg 0.3xx [ 0.2 , 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 ]
                   np.linspace(optimal_th-0.01, optimal_th+0.01, 21)]
                   # eg 0.30x [ 0.29 , 0.291, 0.292, 0.293, 0.294, 0.295, 0.296, 0.297, 0.298, 0.299, 0.3 , 0.301, 0.302, 0.303, 0.304, 0.305, 0.306, 0.307, 0.308, 0.309, 0.31 ]
        for th in th_list[i]:
            y_pred = (model.predict_proba(X_test)[:,1] >= th) # hard labels at threshold th
            f1scor = f1_score(y_test, y_pred) # F1 for this candidate threshold
            score_list.append(f1scor)
            print('{:.3f}->{:.4f}'.format(th, f1scor), end=', ') # display score in 4 decimal pl
        optimal_th = float(th_list[i][score_list.index(max(score_list))])
        print('optimal F1 score = {:.4f}'.format(max(score_list)))
        print('optimal threshold = {:.3f}'.format(optimal_th))
    print(model_name, 'accuracy score is')
    print('Training: {:.2f}%'.format(100*model.score(X_train, y_train))) # score uses accuracy
    print('Test set: {:.2f}%'.format(100*model.score(X_test, y_test))) # should use cross validation
    # Reports at fixed thresholds 0.25 / 0.50 / 0.75, then at the searched optimum.
    y_pred = (model.predict_proba(X_test)[:,1] >= 0.25)
    print('\nAdjust threshold to 0.25:')
    print('Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}'.format(
        precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred)))
    print(model_name, 'confusion matrix: \n', confusion_matrix(y_test, y_pred))
    y_pred = model.predict(X_test)
    print('\nDefault threshold of 0.50:')
    print('Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}'.format(
        precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred)))
    print(model_name, 'confusion matrix: \n', confusion_matrix(y_test, y_pred))
    y_pred = (model.predict_proba(X_test)[:,1] >= 0.75)
    print('\nAdjust threshold to 0.75:')
    print('Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}'.format(
        precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred)))
    print(model_name, 'confusion matrix: \n', confusion_matrix(y_test, y_pred))
    y_pred = (model.predict_proba(X_test)[:,1] >= optimal_th)
    print('\nOptimal threshold {:.3f}'.format(optimal_th))
    print('Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}'.format(
        precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred)))
    print(model_name, 'confusion matrix: \n', confusion_matrix(y_test, y_pred))
    # Export headline metrics for the comparison plots below.
    global model_f1, model_auc, model_ll, model_roc_auc
    model_f1 = f1_score(y_test, y_pred)
    y_pred = model.predict_proba(X_test)
    model_ll = log_loss(y_test, y_pred)
    print(model_name, 'Log-loss: {:.4f}'.format(model_ll))
    # NOTE(review): roc_auc_score here is computed on hard 0/1 predictions,
    # not probabilities, so it understates the true AUC -- confirm intent.
    y_pred = model.predict(X_test)
    model_roc_auc = roc_auc_score(y_test, y_pred)
    print(model_name, 'roc_auc_score: {:.4f}'.format(model_roc_auc))
    y_pred = model.predict_proba(X_test)[:,1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    model_auc = auc(fpr, tpr)
    print(model_name, 'AUC: {:.4f}'.format(model_auc))
    # plot the ROC curve
    plt.figure(figsize = [6,6])
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % model_auc)
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    # plt.savefig('roc_auc_score')
    plt.show()
    return
# initialise lists to collect the results to plot later
# (append order here fixes the index of each model in every list below)
model_list = []
f1_list = []
auc_list = []
ll_list = []
roc_auc_list = []
time_list = []
import time
print('\n"""""" GaussianNB """"""')
time1 = time.time()
gnb = GaussianNB()
model_report('GaussianNB', gnb) # full report via the helper defined above
model_list.append('GaussianNB')
f1_list.append(model_f1)
auc_list.append(model_auc)
ll_list.append(model_ll)
roc_auc_list.append(model_roc_auc)
time_list.append(time.time() - time1)
print('\n"""""" BernoulliNB """"""')
time1 = time.time()
bnb = BernoulliNB()
model_report('BernoulliNB', bnb) # full report via the helper defined above
model_list.append('BernoulliNB')
f1_list.append(model_f1)
auc_list.append(model_auc)
ll_list.append(model_ll)
roc_auc_list.append(model_roc_auc)
time_list.append(time.time() - time1)
print('\n"""""" LogisticRegression """"""')
# Search C over a log scale using 5-fold CV on the F1 score.
print('\nSearch for optimal hyperparameter C in LogisticRegresssion, vary C from 0.001 to 1000, using KFold(5) Cross Validation on train data')
kf = KFold(n_splits=5, random_state=21, shuffle=True) #produce the k folds
score_list = []
c_list = 10**np.linspace(-3,3,200)
for c in c_list:
    logit = LogisticRegression(C = c)
    cvs = (cross_val_score(logit, X_train, y_train, cv=kf, scoring='f1')).mean()
    score_list.append(cvs)
    print('{:.4f}'.format(cvs), end=", ") # 4 decimal pl
print('optimal cv F1 score = {:.4f}'.format(max(score_list)))
optimal_c = float(c_list[score_list.index(max(score_list))])
print('optimal value of C = {:.3f}'.format(optimal_c))
# Refit at the best C and report.
time1 = time.time()
logit = LogisticRegression(C = optimal_c)
model_report('LogisticRegression', logit)
model_list.append('LogisticRegression')
f1_list.append(model_f1)
auc_list.append(model_auc)
ll_list.append(model_ll)
roc_auc_list.append(model_roc_auc)
time_list.append(time.time() - time1)
print('\n"""""" KNN """""" (quite slow)')
# Search K = 1..20 with 5-fold CV on the F1 score.
print('\nSearch for optimal hyperparameter K in KNN, vary K from 1 to 20, using KFold(5) Cross Validation on train data')
kf = KFold(n_splits=5, random_state=21, shuffle=True) #produce the k folds
k_scores = []
for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors = k)
    cvs = cross_val_score(knn, X_train, y_train, cv=kf, scoring='f1').mean()
    k_scores.append(cvs)
    print('{:.4f}'.format(cvs), end=", ")
print('optimal cv F1 score = {:.4f}'.format(max(k_scores))) # 4 decimal pl
optimal_k = k_scores.index(max(k_scores))+1 # index 0 is for k=1
print('optimal value of K =', optimal_k)
time1 = time.time()
knn = KNeighborsClassifier(n_neighbors = optimal_k)
model_report('KNN', knn)
print('\nCompare with KNN classification_report (same as default threshold 0.50)')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
model_list.append('KNN')
f1_list.append(model_f1)
auc_list.append(model_auc)
ll_list.append(model_ll)
roc_auc_list.append(model_roc_auc)
time_list.append(time.time() - time1)
print('\n"""""" DecisionTreeClassifier """"""')
# Search max_depth = 2..10 with 5-fold CV on the F1 score.
print('\nSearch for optimal max_depth in DecisionTree, vary 2 to 10, using KFold(5) Cross Validation on train data')
kf = KFold(n_splits=5, random_state=21, shuffle=True) # produce the k folds
d_scores = []
for d in range(2, 11):
    decisiontree = DecisionTreeClassifier(max_depth=d)
    cvs = cross_val_score(decisiontree, X_train, y_train,
                          cv=kf, scoring='f1').mean()
    d_scores.append(cvs)
    print('{:.4f}'.format(cvs), end=", ")
print('optimal F1 score = {:.4f}'.format(max(d_scores))) # 4 decimal pl
optimal_d = d_scores.index(max(d_scores))+2 # index 0 is for d=2
print('optimal max_depth =', optimal_d)
time1 = time.time()
decisiontree = DecisionTreeClassifier(max_depth=optimal_d)
model_report('DecisionTreeClassifier', decisiontree)
model_list.append('DecisionTreeClassifier')
f1_list.append(model_f1)
auc_list.append(model_auc)
ll_list.append(model_ll)
roc_auc_list.append(model_roc_auc)
time_list.append(time.time() - time1)
print('\n"""""" RandomForestClassifier """""" (quite slow)')
# Search n_estimators = 100..500 with 5-fold CV on the F1 score.
print('\nSearch for optimal n_estimators in RandomForest, vary 100 to 500, using KFold(5) Cross Validation on train data')
kf = KFold(n_splits=5, random_state=21, shuffle=True) #produce the k folds
score_list = []
n_list = []
for n in [100, 150, 200, 250, 300, 350, 400, 450, 500]:
    randomforest = RandomForestClassifier(n_estimators=n)
    cvs = (cross_val_score(randomforest, X_train, y_train, cv=kf, scoring='f1')).mean()
    score_list.append(cvs)
    n_list.append(n)
    print('{:.0f}->{:.4f}'.format(n, cvs), end=", ") # display score in 4 decimal place
print('optimal F1 score = {:.4f}'.format(max(score_list)))
optimal_n = int(n_list[score_list.index(max(score_list))])
print('optimal n_estimators = {:.0f}'.format(optimal_n))
time1 = time.time()
randomforest = RandomForestClassifier(n_estimators=optimal_n)
model_report('RandomForestClassifier', randomforest)
model_list.append('RandomForestClassifier')
f1_list.append(model_f1)
auc_list.append(model_auc)
ll_list.append(model_ll)
roc_auc_list.append(model_roc_auc)
time_list.append(time.time() - time1)
print('\n"""""" LinearSVC """"""')
time1 = time.time()
linearsvc = LinearSVC()
# model_report('LinearSVC', linearsvc) # model has no attribute 'predict_proba'
linearsvc.fit(X_train, y_train)
print('LinearSVC accuracy score is')
print('Training: {:.2f}%'.format(100*linearsvc.score(X_train, y_train))) # score uses accuracy
print('Test set: {:.2f}%'.format(100*linearsvc.score(X_test, y_test))) # should use cross validation
y_pred = linearsvc.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print('LinearSVC confusion matrix: \n', confusion_matrix(y_test, y_pred))
model_f1 = f1_score(y_test, y_pred)
# NOTE(review): log_loss / roc_auc on hard 0/1 labels (no predict_proba here)
# are not true probability-based metrics; they only keep the comparison chart
# populated -- confirm this is intentional.
model_ll = log_loss(y_test, y_pred)
print('LinearSVC Log-loss: {:.4f}'.format(model_ll))
model_roc_auc = roc_auc_score(y_test, y_pred)
print('LinearSVC roc_auc_score: {:.4f}'.format(model_roc_auc))
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
model_auc = auc(fpr, tpr)
print('LinearSVC AUC: {:.4f}'.format(model_auc))
# plot the ROC curve
plt.figure(figsize = [6,6])
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % model_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
# plt.savefig('roc_auc_score')
plt.show()
model_list.append('LinearSVC')
f1_list.append(model_f1)
auc_list.append(model_auc)
ll_list.append(model_ll)
roc_auc_list.append(model_roc_auc)
time_list.append(time.time() - time1)
print('\n"""""" SVC """""" (extremely slow)')
time1 = time.time()
# probability=True enables predict_proba so model_report can threshold-search.
svc = SVC(gamma='scale', probability=True)
model_report('SVC', svc)
model_list.append('SVC')
f1_list.append(model_f1)
auc_list.append(model_auc)
ll_list.append(model_ll)
roc_auc_list.append(model_roc_auc)
# time_list.append(time.time() - time1) # use this line for actual time spent, or
time_list.append(0) # use this line to be able to see time spent for other models
## plot the classification report scores
fig, ax = plt.subplots(5, 1, figsize=(18, 28))
# fig.set_figwidth(10)
# fig.set_figheight(6)
# fig.suptitle('Main Title',fontsize = 16)
ax[0].bar(model_list, f1_list)
ax[0].set_title('F1-score')
ax[1].bar(model_list, auc_list)
ax[1].set_title('AUC-score');
ax[2].bar(model_list, ll_list)
ax[2].set_title('Log-Loss-Score')
ax[3].bar(model_list, roc_auc_list)
ax[3].set_title('ROC AUC Score')
ax[4].bar(model_list, time_list)
ax[4].set_title('Time taken')
# Fine-tune figure; make subplots farther from each other, or nearer to each other.
fig.subplots_adjust(hspace=0.2, wspace=0.2)
model_list
# plot the ROC curves -- all models overlaid on one figure.
# BUGFIX: the legend areas must use each model's position in the append order
# [GNB=0, BNB=1, LogReg=2, KNN=3, DecisionTree=4, RandomForest=5,
#  LinearSVC=6, SVC=7]; the original indices after LogisticRegression were
# shifted and labelled every later curve with the wrong model's AUC.
plt.figure(figsize=(10,10))
y_pred = gnb.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color='blue',
         lw=3, label='GaussianNB (area = %0.2f)' % auc_list[0])
y_pred = bnb.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color='green',
         lw=3, label='BernoulliNB (area = %0.2f)' % auc_list[1])
y_pred = logit.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color='red',
         lw=2, label='LogisticRegression (area = %0.2f)' % auc_list[2])
y_pred = decisiontree.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color='purple',
         lw=2, label='DecisionTree (area = %0.2f)' % auc_list[4])
y_pred = randomforest.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color='brown',
         lw=2, label='RandomForest (area = %0.2f)' % auc_list[5])
# LinearSVC has no predict_proba; hard labels yield a two-segment "curve".
y_pred = linearsvc.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color='cyan',
         lw=2, label='LinearSVC (area = %0.2f)' % auc_list[6])
y_pred = svc.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color='magenta',
         lw=2, label='SVC (area = %0.2f)' % auc_list[7])
y_pred = knn.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color='yellow',
         lw=3, label='KNN (area = %0.2f)' % auc_list[3])
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate', fontsize=13)
plt.ylabel('True Positive Rate', fontsize=14)
plt.title('Receiver Operating Characteristic', fontsize=17)
plt.legend(loc='lower right', fontsize=13)
plt.show()